<!DOCTYPE html>
<html lang=en>
<head>
    <!-- so meta -->
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="HandheldFriendly" content="True">
    <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
    <meta name="description" content="京东爬虫python+selenium+requests 作者-ecin520 我们通过一个name保存商品名称 name=input(&quot;请输入商品名\n&quot;) 但是url不允许中文出现，因此我们要进行转码，也就是转化url编码，通过导入urllib模块实现 from urllib.parse import quote name=quote(name) 我们发现通过关键词搜索，网页变化存在规律">
<meta name="keywords" content="Python,Spider,Selenium">
<meta property="og:type" content="article">
<meta property="og:title" content="京东爬虫python+selenium+requests">
<meta property="og:url" content="http:&#x2F;&#x2F;yoursite.com&#x2F;2019&#x2F;09&#x2F;29&#x2F;%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests&#x2F;index.html">
<meta property="og:site_name" content="ECIN&#39;S BLOG">
<meta property="og:description" content="京东爬虫python+selenium+requests 作者-ecin520 我们通过一个name保存商品名称 name=input(&quot;请输入商品名\n&quot;) 但是url不允许中文出现，因此我们要进行转码，也就是转化url编码，通过导入urllib模块实现 from urllib.parse import quote name=quote(name) 我们发现通过关键词搜索，网页变化存在规律">
<meta property="og:locale" content="en">
<meta property="og:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569685800090.jpg">
<meta property="og:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569685863720.jpg">
<meta property="og:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569685931892.jpg">
<meta property="og:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569686006029.jpg">
<meta property="og:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569686247399.jpg">
<meta property="og:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569686060465.jpg">
<meta property="og:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569686300334.jpg">
<meta property="og:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569686322844.jpg">
<meta property="og:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569686344727.jpg">
<meta property="og:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569686360138.jpg">
<meta property="og:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569686373162.jpg">
<meta property="og:updated_time" content="2020-01-30T04:26:24.231Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="http:&#x2F;&#x2F;106.15.200.82&#x2F;source&#x2F;1569685800090.jpg">
    
    
        
          
              <link rel="shortcut icon" href="/images/favicon.ico">
          
        
        
          
            <link rel="icon" type="image/png" href="/images/favicon-192x192.png" sizes="192x192">
          
        
        
          
            <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon.png">
          
        
    
    <!-- title -->
    <title>京东爬虫python+selenium+requests</title>
    <!-- styles -->
    <link rel="stylesheet" href="/css/style.css">
    <!-- persian styles -->
    
      <link rel="stylesheet" href="/css/rtl.css">
    
    <!-- rss -->
    
    
</head>

<body class="max-width mx-auto px3 ltr">
    
      <div id="header-post">
  <a id="menu-icon" href="#"><i class="fas fa-bars fa-lg"></i></a>
  <a id="menu-icon-tablet" href="#"><i class="fas fa-bars fa-lg"></i></a>
  <a id="top-icon-tablet" href="#" onclick="$('html, body').animate({ scrollTop: 0 }, 'fast');" style="display:none;"><i class="fas fa-chevron-up fa-lg"></i></a>
  <span id="menu">
    <span id="nav">
      <ul>
         
          <li><a href="/">Home</a></li>
         
          <li><a href="/about/">About</a></li>
         
          <li><a href="/archives/">Writing</a></li>
         
          <li><a href="/projects_url">Projects</a></li>
        
      </ul>
    </span>
    <br/>
    <span id="actions">
      <ul>
        
        <li><a class="icon" href="/2019/09/29/CentOS%E5%AE%89%E8%A3%85Java/"><i class="fas fa-chevron-left" aria-hidden="true" onmouseover="$('#i-prev').toggle();" onmouseout="$('#i-prev').toggle();"></i></a></li>
        
        
        <li><a class="icon" href="/2019/09/29/%E3%80%90%E8%BD%AC%E8%BD%BD%E3%80%91Linux%E4%B9%8Bnginx/"><i class="fas fa-chevron-right" aria-hidden="true" onmouseover="$('#i-next').toggle();" onmouseout="$('#i-next').toggle();"></i></a></li>
        
        <li><a class="icon" href="#" onclick="$('html, body').animate({ scrollTop: 0 }, 'fast');"><i class="fas fa-chevron-up" aria-hidden="true" onmouseover="$('#i-top').toggle();" onmouseout="$('#i-top').toggle();"></i></a></li>
        <li><a class="icon" href="#"><i class="fas fa-share-alt" aria-hidden="true" onmouseover="$('#i-share').toggle();" onmouseout="$('#i-share').toggle();" onclick="$('#share').toggle();return false;"></i></a></li>
      </ul>
      <span id="i-prev" class="info" style="display:none;">Previous post</span>
      <span id="i-next" class="info" style="display:none;">Next post</span>
      <span id="i-top" class="info" style="display:none;">Back to top</span>
      <span id="i-share" class="info" style="display:none;">Share post</span>
    </span>
    <br/>
    <div id="share" style="display: none">
      <ul>
  <li><a class="icon" href="http://www.facebook.com/sharer.php?u=http://yoursite.com/2019/09/29/%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests/" target="_blank" rel="noopener"><i class="fab fa-facebook " aria-hidden="true"></i></a></li>
  <li><a class="icon" href="https://twitter.com/share?url=http://yoursite.com/2019/09/29/%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests/&text=京东爬虫python+selenium+requests" target="_blank" rel="noopener"><i class="fab fa-twitter " aria-hidden="true"></i></a></li>
  <li><a class="icon" href="http://www.linkedin.com/shareArticle?url=http://yoursite.com/2019/09/29/%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests/&title=京东爬虫python+selenium+requests" target="_blank" rel="noopener"><i class="fab fa-linkedin " aria-hidden="true"></i></a></li>
  <li><a class="icon" href="https://pinterest.com/pin/create/bookmarklet/?url=http://yoursite.com/2019/09/29/%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests/&is_video=false&description=京东爬虫python+selenium+requests" target="_blank" rel="noopener"><i class="fab fa-pinterest " aria-hidden="true"></i></a></li>
  <li><a class="icon" href="mailto:?subject=京东爬虫python+selenium+requests&body=Check out this article: http://yoursite.com/2019/09/29/%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests/" target="_blank" rel="noopener"><i class="fas fa-envelope " aria-hidden="true"></i></a></li>
  <li><a class="icon" href="https://getpocket.com/save?url=http://yoursite.com/2019/09/29/%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests/&title=京东爬虫python+selenium+requests" target="_blank" rel="noopener"><i class="fab fa-get-pocket " aria-hidden="true"></i></a></li>
  <li><a class="icon" href="http://reddit.com/submit?url=http://yoursite.com/2019/09/29/%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests/&title=京东爬虫python+selenium+requests" target="_blank" rel="noopener"><i class="fab fa-reddit " aria-hidden="true"></i></a></li>
  <li><a class="icon" href="http://www.stumbleupon.com/submit?url=http://yoursite.com/2019/09/29/%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests/&title=京东爬虫python+selenium+requests" target="_blank" rel="noopener"><i class="fab fa-stumbleupon " aria-hidden="true"></i></a></li>
  <li><a class="icon" href="http://digg.com/submit?url=http://yoursite.com/2019/09/29/%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests/&title=京东爬虫python+selenium+requests" target="_blank" rel="noopener"><i class="fab fa-digg " aria-hidden="true"></i></a></li>
  <li><a class="icon" href="http://www.tumblr.com/share/link?url=http://yoursite.com/2019/09/29/%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests/&name=京东爬虫python+selenium+requests&description=" target="_blank" rel="noopener"><i class="fab fa-tumblr " aria-hidden="true"></i></a></li>
  <li><a class="icon" href="https://news.ycombinator.com/submitlink?u=http://yoursite.com/2019/09/29/%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython+selenium+requests/&t=京东爬虫python+selenium+requests" target="_blank" rel="noopener"><i class="fab fa-hacker-news " aria-hidden="true"></i></a></li>
</ul>

    </div>
    <div id="toc">
      <ol class="toc"><li class="toc-item toc-level-1"><a class="toc-link" href="#京东爬虫python-selenium-requests"><span class="toc-number">1.</span> <span class="toc-text">京东爬虫python+selenium+requests</span></a><ol class="toc-child"><li class="toc-item toc-level-6"><a class="toc-link" href="#作者-ecin520"><span class="toc-number">1.0.0.0.0.1.</span> <span class="toc-text">作者-ecin520</span></a></li><li class="toc-item toc-level-5"><a class="toc-link" href="#我们通过一个name保存商品名称"><span class="toc-number">1.0.0.0.1.</span> <span class="toc-text">我们通过一个name保存商品名称</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#selenium模块介绍"><span class="toc-number">1.0.0.1.</span> <span class="toc-text">selenium模块介绍</span></a><ol class="toc-child"><li class="toc-item toc-level-6"><a class="toc-link" href="#selenium安装"><span class="toc-number">1.0.0.1.0.1.</span> <span class="toc-text">selenium安装</span></a></li></ol></li><li class="toc-item toc-level-4"><a class="toc-link" href="#开始爬虫之旅"><span class="toc-number">1.0.0.2.</span> <span class="toc-text">开始爬虫之旅</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#分析json文本"><span class="toc-number">1.0.0.3.</span> <span class="toc-text">分析json文本</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#xlwings模块导入excel中"><span class="toc-number">1.0.0.4.</span> <span class="toc-text">xlwings模块导入excel中</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#去重"><span class="toc-number">1.0.0.5.</span> <span class="toc-text">去重</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#源代码"><span class="toc-number">1.0.0.6.</span> <span class="toc-text">源代码</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#总结"><span class="toc-number">1.0.0.7.</span> <span class="toc-text">总结</span></a></li></ol></li></ol>
    </div>
  </span>
</div>

    
    <div class="content index py4">
        
        <article class="post" itemscope itemtype="http://schema.org/BlogPosting">
  <header>
    
    <h1 class="posttitle" itemprop="name headline">
        京东爬虫python+selenium+requests
    </h1>



    <div class="meta">
      <span class="author" itemprop="author" itemscope itemtype="http://schema.org/Person">
        <span itemprop="name">ECIN'S BLOG</span>
      </span>
      
    <div class="postdate">
      
        <time datetime="2019-09-28T16:12:06.000Z" itemprop="datePublished">2019-09-29</time>
        
      
    </div>


      

      
    <div class="article-tag">
        <i class="fas fa-tag"></i>
        <a class="tag-link" href="/tags/Python/" rel="tag">Python</a>, <a class="tag-link" href="/tags/Selenium/" rel="tag">Selenium</a>, <a class="tag-link" href="/tags/Spider/" rel="tag">Spider</a>
    </div>


    </div>
  </header>
  

  <div class="content" itemprop="articleBody">
    <h1 id="京东爬虫python-selenium-requests"><a href="#京东爬虫python-selenium-requests" class="headerlink" title="京东爬虫python+selenium+requests"></a>京东爬虫python+selenium+requests</h1><h6 id="作者-ecin520"><a href="#作者-ecin520" class="headerlink" title="作者-ecin520"></a>作者-ecin520</h6><h5 id="我们通过一个name保存商品名称"><a href="#我们通过一个name保存商品名称" class="headerlink" title="我们通过一个name保存商品名称"></a>我们通过一个name保存商品名称</h5><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">name=input(&quot;请输入商品名\n&quot;)</span><br></pre></td></tr></table></figure>

<p>但是url不允许中文出现，因此我们要进行转码，也就是转化url编码，通过导入urllib模块实现</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">from urllib.parse import quote</span><br><span class="line">name=quote(name)</span><br></pre></td></tr></table></figure>

<p>我们发现通过关键词搜索，网页变化存在规律</p>
<p><img src="http://106.15.200.82/source/1569685800090.jpg" alt="image.png"></p>
<p>于是我们可以套用这个网页来进入京东商品的页面</p>
<p>在京东商品的首页，每个关键词的商品有很多页面，而且网页为动态网页，使用普通的requests模块爬取不太方便，也不太全面，因此可以用selenium测试模块来进行翻页。</p>
<p><img src="http://106.15.200.82/source/1569685863720.jpg" alt="image.png"></p>
<h4 id="selenium模块介绍"><a href="#selenium模块介绍" class="headerlink" title="selenium模块介绍"></a>selenium模块介绍</h4><pre><code>selenium模块是一个浏览器自动化测试框架，是一个用于Web应用程序测试的工具，selenium测试直接运行在浏览器中，就像真正的用户在操作一样，因此可以获取网页的动态源码。</code></pre><h6 id="selenium安装"><a href="#selenium安装" class="headerlink" title="selenium安装"></a>selenium安装</h6><p>在配置好环境后，直接命令行输入</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">pip install selenium</span><br></pre></td></tr></table></figure>

<p>安装好后，我们需要安装浏览器驱动——driver，以Chrome为例</p>
<p>下载好chromedriver.exe这个文件，注意，版本号要和你本机的Chrome一致！下载好后，放入Chrome的安装目录下，将此文件加入path环境变量，这样就可以在python中使用Chrome的webdriver了。</p>
<h4 id="开始爬虫之旅"><a href="#开始爬虫之旅" class="headerlink" title="开始爬虫之旅"></a>开始爬虫之旅</h4><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">url=&apos;https://search.jd.com/Search?keyword=&apos;+name+&apos;&amp;enc=utf-8&amp;pvid=aa18c6ce55624fe0a035e154834f062e&apos;#进入输入的关键词也就是商品的页面</span><br><span class="line"></span><br><span class="line">driver=webdriver.Chrome()  #获得webdriver驱动</span><br><span class="line">driver.get(url) #进入商品列表的第一页</span><br></pre></td></tr></table></figure>

<p>因为我们的目的比较贪心，想要搜索出来的所有页面的商品的评论都雨露均沾，所以我们写入一个死循环来达到这个目的。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">while True:</span><br><span class="line">	********</span><br><span class="line">	********</span><br><span class="line">	driver.find_element_by_link_text(u&quot;下一页&gt;&quot;).click() #点击下一页</span><br><span class="line">	time.sleep(4)#防止访问速度过快被封ip，暂停4s</span><br></pre></td></tr></table></figure>

<p>这样就能爬完一页的商品继续爬下一页，直到全部爬完。</p>
<p>好了，我们进入了第一页的商品页面，查看源代码</p>
<p><img src="http://106.15.200.82/source/1569685931892.jpg" alt="image.png"></p>
<p>发现可以将每个商品的url通过正则表达式套取出来，我们使用python的re库</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">import re</span><br><span class="line">rex=re.findall(r&apos;&lt;a target=&quot;_blank&quot; title=(.*?)&quot;&gt;&apos;,driver.page_source)</span><br></pre></td></tr></table></figure>

<p>这样我们就获取了商品的名称和其对应的url</p>
<p><img src="http://106.15.200.82/source/1569686006029.jpg" alt="image.png"></p>
<p>好了，这样我们就可以访问每一个商品的页面了。</p>
<p>通过一个循环遍历当前页面的所有商品</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">for i in rex:</span><br><span class="line">    commodityName=re.findall(r&apos;&quot;(.*?)&quot; href&apos;,i)[0]  #获得商品的标题</span><br><span class="line">    commodityId=re.findall(r&apos;item.jd.com/(.*?).html&apos;,i)[0] #获得商品的id</span><br><span class="line">    commodityUrl=&apos;https://item.jd.com/&apos;+commodityId+&apos;.html&apos;#此商品的url</span><br></pre></td></tr></table></figure>

<p>但是当我们进入商品页面时，我们正以为可以直接从页面源代码中获取商品评论而窃喜时，却发现一个致命的要素，京东商品页面也是动态加载的，我们无法从源代码中找到评论的足迹，而且我们点击评论时，url不会发生变化。于是猜测，评论数据是否存在于json中，我们通过Chrome抓取http请求查看其规律。</p>
<p><img src="http://106.15.200.82/source/1569686247399.jpg" alt="image.png"></p>
<p>打开Chrome的开发者界面，点击评论后，发现Network这里多了一个productPageComments，顾名思义，我们猜测这是一个存放评论的请求地址</p>
<p><img src="http://106.15.200.82/source/1569686060465.jpg" alt="image.png"></p>
<p>我们requests这个url后发现</p>
<p><img src="http://106.15.200.82/source/1569686300334.jpg" alt="image.png"></p>
<p>山重水复疑无路，柳暗花明又一村。发现这是一个json文件，商品的信息都存放在这个json中，我们点击第二页的评论，发现又出现一个类似的url，我们分析多个商品评论的url后发现</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv205&amp;productId=100005603836&amp;score=0&amp;sortType=5&amp;page=0&amp;pageSize=10&amp;isShadowSku=0&amp;fold=1</span><br><span class="line"></span><br><span class="line">https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv205&amp;productId=100005603836&amp;score=0&amp;sortType=5&amp;page=1&amp;pageSize=10&amp;isShadowSku=0&amp;rid=0&amp;fold=1</span><br><span class="line"></span><br><span class="line">https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv3469&amp;productId=100002928171&amp;score=0&amp;sortType=5&amp;page=0&amp;pageSize=10&amp;isShadowSku=0&amp;fold=1</span><br></pre></td></tr></table></figure>

<p>找到了规律。上述url我们可以发现productId=后面的是商品的id，也就是每个商品页面的url后面的一串数字。</p>
<p><img src="http://106.15.200.82/source/1569686322844.jpg" alt="image.png"></p>
<p>page=后面跟的是商品的页数（第几页）。</p>
<p>但是还有一个不同的地方fetchJSON_comment98vv后面的数字却是不一样了，到了这里，不禁陷入了思考，究竟这个数字有什么规律呢，找到了这个数字是不是意味着爬虫成功的机率又提升了呢？答案是确实如此，我们回到刚刚的商品页面的源代码中，在Chrome中Ctrl+F进行搜索这个数字。</p>
<p><img src="http://106.15.200.82/source/1569686344727.jpg" alt="image.png"></p>
<p>这不就是我们想要的那个神秘数字吗？既然看到了，不就可以通过正则表达式匹配出来吗</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">commentVersion = re.findall(r&quot;commentVersion:&apos;(.*?)&apos;,&quot;, response.text)[0]#获得version的id</span><br></pre></td></tr></table></figure>

<p>那么每件商品有多少页呢？</p>
<p><img src="http://106.15.200.82/source/1569686360138.jpg" alt="image.png"></p>
<p>再次观察发现，每个商品评论页面有10条评论，那么8.5万+不就是有8500个左右评论页面吗，我们写一个函数来套取这个数字</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line">def getPageNum(str):   #转换字符串函数，如10.7万+转换成107000</span><br><span class="line">    str=str.replace(&apos;+&apos;,&apos;&apos;)</span><br><span class="line">    if &apos;万&apos; in str:</span><br><span class="line">        str=str.replace(&apos;.&apos;,&apos;&apos;)</span><br><span class="line">        str=str.replace(&apos;万&apos;,&apos;&apos;)</span><br><span class="line">        str+=&apos;0000&apos;</span><br><span class="line">    return str      #每个评论页面总共有十条评论，折算一下str/10个页面</span><br></pre></td></tr></table></figure>

<p>这样，我们获取了commentVersion的id，知道了商品id，知道了页数，知道了规律，京东爬虫是不是就可以实现了！通过pageComment获取json源码</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">pageComment = requests.get(</span><br><span class="line">                        &apos;https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&apos;</span><br><span class="line">                        &apos;vv&apos; + commentVersion + &apos;&amp;productId=&apos; + commodityId + &apos;&amp;score=0&amp;sortType=5&amp;page=&apos;+str(j)+&apos;&amp;pageSize=10&apos;</span><br><span class="line">                                                                              &apos;&amp;isShadowSku=0&amp;rid=0&amp;fold=1&apos;, headers=headers).text</span><br></pre></td></tr></table></figure>

<p>将匹配到的json格式串通过json.loads()转化成python对象（字典）存入jsonText中</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">Text=re.findall(r&apos;&#123;&quot;productAttr&quot;(.*?)&#125;\);&apos;,homeComment)[0]</span><br><span class="line">Text=&apos;&#123;&quot;productAttr&quot;&apos;+Text+&apos;&#125;&apos;</span><br><span class="line">jsonText=json.loads(Text)#获得的json</span><br></pre></td></tr></table></figure>

<h4 id="分析json文本"><a href="#分析json文本" class="headerlink" title="分析json文本"></a>分析json文本</h4><p>解析获得的json看出其结构</p>
<p><img src="http://106.15.200.82/source/1569686373162.jpg" alt="image.png"></p>
<p>找出了其规律</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">comments = jsonText[&apos;comments&apos;]</span><br><span class="line">for comment in comments:</span><br><span class="line">    print(comment[&apos;id&apos;])</span><br><span class="line">    print(comment[&apos;referenceName&apos;])</span><br><span class="line">    print(comment[&apos;content&apos;])</span><br></pre></td></tr></table></figure>

<p>通过这段代码，我们打印出了商品评论的用户id，此id评论的内容和此商品的名字</p>
<h4 id="xlwings模块导入excel中"><a href="#xlwings模块导入excel中" class="headerlink" title="xlwings模块导入excel中"></a>xlwings模块导入excel中</h4><p>打开excel，获取sheet1，在A1 B1 C1分别写入指定字符串</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">excel.App(visible=False,add_book=False).books.open(r&quot;C:\Users\NICE\Desktop\recoder.xlsx&quot;)#打开excel</span><br><span class="line">sheet=excel.sheets[&apos;Sheet1&apos;]#sheet1</span><br><span class="line">sheet.range(&apos;A1&apos;).value=&quot;商品名称&quot;#A1 B1 C1添加列名称</span><br><span class="line">sheet.range(&apos;B1&apos;).value=&quot;用户id&quot;</span><br><span class="line">sheet.range(&apos;C1&apos;).value=&quot;用户评论&quot;</span><br></pre></td></tr></table></figure>

<p>开始写入商品评论</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line">for comment in comments:</span><br><span class="line">    row = sheet[&apos;A1048576&apos;].end(&apos;up&apos;).row + 1#追加数据的一行在目前填入的内容最大的行数下一行</span><br><span class="line">    print(comment[&apos;id&apos;])</span><br><span class="line">    print(comment[&apos;referenceName&apos;])</span><br><span class="line">    print(comment[&apos;content&apos;])</span><br><span class="line">    sheet.range(&apos;A&apos; + str(row)).value = comment[&apos;referenceName&apos;]</span><br><span class="line">    sheet.range(&apos;B&apos; + str(row)).value = comment[&apos;id&apos;]</span><br><span class="line">    sheet.range(&apos;C&apos; + str(row)).value = comment[&apos;content&apos;]</span><br></pre></td></tr></table></figure>

<h4 id="去重"><a href="#去重" class="headerlink" title="去重"></a>去重</h4><p>为了防止相同的商品或者页面重复访问，我们可以将已经访问过的url放入数组中</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line">list=[] #去重数组</span><br><span class="line">if url in list:#避免重复访问</span><br><span class="line">	continue</span><br><span class="line">else:</span><br><span class="line">	list.append(url)</span><br><span class="line">	print(&apos;进行需要的操作&apos;)</span><br></pre></td></tr></table></figure>

<h4 id="源代码"><a href="#源代码" class="headerlink" title="源代码"></a>源代码</h4><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span 
class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br><span class="line">80</span><br><span class="line">81</span><br><span class="line">82</span><br><span class="line">83</span><br><span class="line">84</span><br><span class="line">85</span><br><span class="line">86</span><br><span class="line">87</span><br><span class="line">88</span><br><span class="line">89</span><br><span class="line">90</span><br><span class="line">91</span><br><span class="line">92</span><br><span class="line">93</span><br><span class="line">94</span><br><span class="line">95</span><br><span class="line">96</span><br><span class="line">97</span><br><span class="line">98</span><br><span class="line">99</span><br><span class="line">100</span><br><span class="line">101</span><br><span class="line">102</span><br><span class="line">103</span><br><span class="line">104</span><br><span class="line">105</span><br><span class="line">106</span><br><span class="line">107</span><br><span class="line">108</span><br><span class="line">109</span><br><span class="line">110</span><br><span class="line">111</span><br><span class="line">112</span><br><span class="line">113</span><br><span class="line">114</span><br><span class="line">115</span><br><span class="line">116</span><br><span class="line">117</span><br><span class="line">118</span><br><span class="line">119</span><br><span 
class="line">120</span><br><span class="line">121</span><br><span class="line">122</span><br><span class="line">123</span><br><span class="line">124</span><br><span class="line">125</span><br><span class="line">126</span><br><span class="line">127</span><br><span class="line">128</span><br><span class="line">129</span><br><span class="line">130</span><br><span class="line">131</span><br><span class="line">132</span><br><span class="line">133</span><br><span class="line">134</span><br><span class="line">135</span><br><span class="line">136</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"><span class="keyword">import</span> json</span><br><span class="line"><span class="keyword">import</span> re</span><br><span class="line"><span class="keyword">import</span> time</span><br><span class="line"><span class="keyword">from</span> urllib.parse <span class="keyword">import</span> quote</span><br><span class="line"><span class="keyword">from</span> selenium <span class="keyword">import</span> webdriver</span><br><span class="line"><span class="keyword">import</span> xlwings <span class="keyword">as</span> excel</span><br><span class="line"></span><br><span class="line"><span class="comment">#请求头</span></span><br><span class="line">headers = &#123;</span><br><span class="line">        <span class="string">'Referer'</span>:<span class="string">'https://www.jd.com/'</span>,<span class="string">'User-Agent'</span>: <span class="string">'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'</span></span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">getPageNum</span><span class="params">(str)</span>:</span>   <span class="comment">#转换字符串函数，如10.7万+转换成107000</span></span><br><span class="line">    
str=str.replace(<span class="string">'+'</span>,<span class="string">''</span>)</span><br><span class="line">    <span class="keyword">if</span> <span class="string">'万'</span> <span class="keyword">in</span> str:</span><br><span class="line">        str=str.replace(<span class="string">'.'</span>,<span class="string">''</span>)</span><br><span class="line">        str=str.replace(<span class="string">'万'</span>,<span class="string">''</span>)</span><br><span class="line">        str+=<span class="string">'0000'</span></span><br><span class="line">    <span class="keyword">return</span> str      <span class="comment">#每个评论页面总共有十条评论，折算一下str/10个页面</span></span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">getHtml</span><span class="params">(url)</span>:</span>   <span class="comment">#设置响应超时，防止一直等待页面</span></span><br><span class="line">    i=<span class="number">0</span></span><br><span class="line">    <span class="keyword">while</span>(i&lt;=<span class="number">6</span>):</span><br><span class="line">        <span class="keyword">try</span>:</span><br><span class="line">            response=requests.get(url,headers=headers,timeout=<span class="number">5</span>)</span><br><span class="line">            <span class="keyword">return</span> response</span><br><span class="line">        <span class="keyword">except</span> requests.exceptions.RequestException:</span><br><span class="line">            i+=<span class="number">1</span></span><br><span class="line">            print(i)</span><br><span class="line"></span><br><span class="line">name=input(<span class="string">"请输入商品名\n"</span>)</span><br><span class="line">name=quote(name)</span><br><span class="line">url=<span class="string">'https://search.jd.com/Search?keyword='</span>+name+<span class="string">'&amp;enc=utf-8&amp;pvid=aa18c6ce55624fe0a035e154834f062e'</span><span 
class="comment">#进入输入的关键词也就是商品的页面</span></span><br><span class="line"></span><br><span class="line">driver=webdriver.Chrome()  <span class="comment">#获得webriver驱动</span></span><br><span class="line">driver.get(url) <span class="comment">#进入第一页</span></span><br><span class="line">list=[] <span class="comment">#去重数组</span></span><br><span class="line"></span><br><span class="line">excel.App(visible=<span class="literal">False</span>,add_book=<span class="literal">False</span>).books.open(<span class="string">r"C:\Users\NICE\Desktop\recoder.xlsx"</span>)<span class="comment">#打开excel</span></span><br><span class="line">sheet=excel.sheets[<span class="string">'Sheet1'</span>]<span class="comment">#sheet1</span></span><br><span class="line">sheet.range(<span class="string">'A1'</span>).value=<span class="string">"商品名称"</span><span class="comment">#A1 B1 C1添加列名称</span></span><br><span class="line">sheet.range(<span class="string">'B1'</span>).value=<span class="string">"用户id"</span></span><br><span class="line">sheet.range(<span class="string">'C1'</span>).value=<span class="string">"用户评论"</span></span><br><span class="line"></span><br><span class="line"><span class="keyword">while</span> <span class="literal">True</span>:  <span class="comment">#写一个死循环爬取一件商品的所有页面</span></span><br><span class="line">    rex=re.findall(<span class="string">r'&lt;a target="_blank" title=(.*?)"&gt;'</span>,driver.page_source) <span class="comment">#用正则表达式把主要的内容解析出来，包括title url id</span></span><br><span class="line">    <span class="keyword">for</span> i <span class="keyword">in</span> rex:</span><br><span class="line">        <span class="keyword">try</span>:</span><br><span class="line">            print(i)</span><br><span class="line">            commodityName=re.findall(<span class="string">r'"(.*?)" href'</span>,i)[<span class="number">0</span>]  <span class="comment">#获得商品的标题</span></span><br><span class="line">            commodityId=re.findall(<span 
class="string">r'item.jd.com/(.*?).html'</span>,i)[<span class="number">0</span>] <span class="comment">#获得商品的id</span></span><br><span class="line">            commodityUrl=<span class="string">'https://item.jd.com/'</span>+commodityId+<span class="string">'.html'</span></span><br><span class="line">            <span class="keyword">if</span> commodityUrl <span class="keyword">in</span> list:<span class="comment">#避免重复访问</span></span><br><span class="line">                <span class="keyword">continue</span></span><br><span class="line">            <span class="keyword">else</span>:</span><br><span class="line">                print(commodityName+<span class="string">" "</span>+commodityUrl)</span><br><span class="line">                <span class="comment"># requests.get(commodityUrl, headers=headers)</span></span><br><span class="line">                response=getHtml(commodityUrl)</span><br><span class="line">                commentVersion = re.findall(<span class="string">r"commentVersion:'(.*?)',"</span>, response.text)[<span class="number">0</span>]<span class="comment">#获得version的id</span></span><br><span class="line">                <span class="comment">#https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv'+commentVersion+'&amp;productId='+commodityId+'&amp;score=0&amp;sortType=5&amp;page=0&amp;pageSize=10&amp;isShadowSku=0&amp;fold=1</span></span><br><span class="line">                <span class="comment">#https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv'+commentVersion+'&amp;productId='+commodityId+'&amp;score=0&amp;sortType=5&amp;page=8&amp;pageSize=10&amp;isShadowSku=0&amp;rid=0&amp;fold=1</span></span><br><span class="line"></span><br><span class="line">                <span class="comment"># homeComment=requests.get('https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98'</span></span><br><span class="line">                <span class="comment">#  
                        'vv'+commentVersion+'&amp;productId='+commodityId+'&amp;score=0&amp;sortType=5&amp;page=0&amp;pageSize=10'</span></span><br><span class="line">                <span class="comment">#                                                                        '&amp;isShadowSku=0&amp;fold=1',headers=headers).text</span></span><br><span class="line">                homeComment=getHtml(<span class="string">'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98'</span></span><br><span class="line">                                         <span class="string">'vv'</span>+commentVersion+<span class="string">'&amp;productId='</span>+commodityId+<span class="string">'&amp;score=0&amp;sortType=5&amp;page=0&amp;pageSize=10'</span></span><br><span class="line">                                                                                       <span class="string">'&amp;isShadowSku=0&amp;fold=1'</span>,headers=headers).text</span><br><span class="line"></span><br><span class="line">                file = open(<span class="string">'C:\\Users\\NICE\\Desktop\\recoder.json'</span>, <span class="string">'a'</span>)</span><br><span class="line">                file.write(homeComment)</span><br><span class="line">                file.close()</span><br><span class="line">                <span class="keyword">try</span>:</span><br><span class="line">                    numComment=re.findall(<span class="string">r'commentCountStr":"(.*?)"'</span>,homeComment)[<span class="number">0</span>]</span><br><span class="line">                    print(getPageNum(numComment))</span><br><span class="line">                <span class="keyword">except</span>:</span><br><span class="line">                    <span class="keyword">pass</span></span><br><span class="line"></span><br><span class="line">                print(homeComment) <span class="comment">#打印第一页评论</span></span><br><span class="line"></span><br><span class="line">                
pages = int(getPageNum(numComment))</span><br><span class="line">                print(<span class="string">'--------------------------------------------------------------------------------------------------------------'</span>)</span><br><span class="line">                Text=re.findall(<span class="string">r'&#123;"productAttr"(.*?)&#125;\);'</span>,homeComment)[<span class="number">0</span>]</span><br><span class="line">                Text=<span class="string">'&#123;"productAttr"'</span>+Text+<span class="string">'&#125;'</span></span><br><span class="line">                jsonText=json.loads(Text)<span class="comment">#获得的json</span></span><br><span class="line">                comments=jsonText[<span class="string">'comments'</span>]</span><br><span class="line">                <span class="keyword">for</span> comment <span class="keyword">in</span> comments:</span><br><span class="line">                    row = sheet[<span class="string">'A1048576'</span>].end(<span class="string">'up'</span>).row + <span class="number">1</span></span><br><span class="line">                    print(comment[<span class="string">'id'</span>])</span><br><span class="line">                    print(comment[<span class="string">'referenceName'</span>])</span><br><span class="line">                    print(comment[<span class="string">'content'</span>])</span><br><span class="line">                    sheet.range(<span class="string">'A'</span> + str(row)).value = comment[<span class="string">'referenceName'</span>]</span><br><span class="line">                    sheet.range(<span class="string">'B'</span> + str(row)).value = comment[<span class="string">'id'</span>]</span><br><span class="line">                    sheet.range(<span class="string">'C'</span> + str(row)).value = comment[<span class="string">'content'</span>]</span><br><span class="line">                print(<span 
class="string">'--------------------------------------------------------------------------------------------------------------'</span>)</span><br><span class="line"></span><br><span class="line">                <span class="comment"># for j in range(1,pages):</span></span><br><span class="line">                <span class="keyword">for</span> j <span class="keyword">in</span> range(<span class="number">1</span>, <span class="number">100</span>):</span><br><span class="line">                    <span class="comment"># pageComment = requests.get(</span></span><br><span class="line">                    <span class="comment">#     'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98'</span></span><br><span class="line">                    <span class="comment">#     'vv' + commentVersion + '&amp;productId=' + commodityId + '&amp;score=0&amp;sortType=5&amp;page='+str(j)+'&amp;pageSize=10'</span></span><br><span class="line">                    <span class="comment">#                                                           '&amp;isShadowSku=0&amp;rid=0&amp;fold=1', headers=headers).text</span></span><br><span class="line">                    pageComment = getHtml(</span><br><span class="line">                        <span class="string">'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98'</span></span><br><span class="line">                        <span class="string">'vv'</span> + commentVersion + <span class="string">'&amp;productId='</span> + commodityId + <span class="string">'&amp;score=0&amp;sortType=5&amp;page='</span>+str(j)+<span class="string">'&amp;pageSize=10'</span></span><br><span class="line">                                                                              <span class="string">'&amp;isShadowSku=0&amp;rid=0&amp;fold=1'</span>, headers=headers).text</span><br><span class="line">                    <span class="comment"># 
print('https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98'</span></span><br><span class="line">                    <span class="comment">#     'vv' + commentVersion + '&amp;productId=' + commodityId + '&amp;score=0&amp;sortType=5&amp;page='+str(j)+'&amp;pageSize=10'</span></span><br><span class="line">                    <span class="comment">#                                                           '&amp;isShadowSku=0&amp;rid=0&amp;fold=1')</span></span><br><span class="line">                    Text = re.findall(<span class="string">r'&#123;"productAttr"(.*?)&#125;\);'</span>, pageComment)[<span class="number">0</span>]</span><br><span class="line">                    Text = <span class="string">'&#123;"productAttr"'</span> + Text + <span class="string">'&#125;'</span></span><br><span class="line">                    jsonText = json.loads(Text)  <span class="comment"># 获得的json</span></span><br><span class="line">                    comments = jsonText[<span class="string">'comments'</span>]</span><br><span class="line">                    <span class="keyword">for</span> comment <span class="keyword">in</span> comments:</span><br><span class="line">                        row = sheet[<span class="string">'A1048576'</span>].end(<span class="string">'up'</span>).row + <span class="number">1</span><span class="comment">#追加数据的一行在目前填入的内容最大的行数下一行</span></span><br><span class="line">                        print(comment[<span class="string">'id'</span>])</span><br><span class="line">                        print(comment[<span class="string">'referenceName'</span>])</span><br><span class="line">                        print(comment[<span class="string">'content'</span>])</span><br><span class="line">                        sheet.range(<span class="string">'A'</span> + str(row)).value = comment[<span class="string">'referenceName'</span>]</span><br><span class="line">                        sheet.range(<span class="string">'B'</span> + 
str(row)).value = comment[<span class="string">'id'</span>]</span><br><span class="line">                        sheet.range(<span class="string">'C'</span> + str(row)).value = comment[<span class="string">'content'</span>]</span><br><span class="line">                    <span class="comment"># print(pageComment)</span></span><br><span class="line">                    file=open(<span class="string">'C:\\Users\\NICE\\Desktop\\recoder.json'</span>,<span class="string">'a'</span>)</span><br><span class="line">                    file.write(pageComment)</span><br><span class="line">                    file.close()</span><br><span class="line">                    time.sleep(<span class="number">3</span>)<span class="comment">#不休息会被封ip</span></span><br><span class="line"></span><br><span class="line"></span><br><span class="line">            list.append(commodityUrl)</span><br><span class="line">        <span class="keyword">except</span>:</span><br><span class="line">            <span class="keyword">pass</span></span><br><span class="line">    print(<span class="string">'--------------------------------------------------------------------------------------------------------------'</span>)</span><br><span class="line">    driver.find_element_by_link_text(<span class="string">u"下一页&gt;"</span>).click() <span class="comment">#点击下一页</span></span><br><span class="line">    time.sleep(<span class="number">4</span>)<span class="comment">#防止出错，暂停4s</span></span><br></pre></td></tr></table></figure>



<h4 id="总结"><a href="#总结" class="headerlink" title="总结"></a>总结</h4><p>京东商品评论爬虫总体来说，难点有以下几点：1. 动态页面的解析；2. 通过正则表达式匹配出需要的关键数据；3. 分析京东网页get请求的规律，找到评论的json url；4. 请求延时。其中最重要的就是寻找规律。所谓爬虫，所见即所得，你看得到的总能爬到，不管反爬机制多厉害，网页是要呈现给用户的，因此留给爬虫的机会就多了，只要发送请求，url就可获取。</p>
<p>本爬虫程序大致流程为：</p>
<p>用户输入商品的名称&gt;&gt;将中文转为url编码&gt;&gt;selenium模块进入首页&gt;&gt;获取首页所有商品名称及url&gt;&gt;进入每个商品的url&gt;&gt;获取商品页的commentVersion id与商品id以及评论数目&gt;&gt;逐页请求评论的json并解析&gt;&gt;将评论写入excel&gt;&gt;点击下一页继续爬取</p>

  </div>
</article>



        
          <div id="footer-post-container">
  <div id="footer-post">

    <!-- Site navigation panel; starts hidden and is toggled by the "Menu" action link. -->
    <div id="nav-footer" style="display: none">
      <ul>
        <li><a href="/">Home</a></li>
        <li><a href="/about/">About</a></li>
        <li><a href="/archives/">Writing</a></li>
        <li><a href="/projects_url">Projects</a></li>
      </ul>
    </div>

    <!--
      Table of contents for the post; starts hidden and is toggled by the "TOC" action link.
      The list is kept well-formed: every sub-heading entry lives inside the
      toc-child list of the level-1 title entry, and each <ol> is closed exactly once.
    -->
    <div id="toc-footer" style="display: none">
      <ol class="toc">
        <li class="toc-item toc-level-1">
          <a class="toc-link" href="#京东爬虫python-selenium-requests"><span class="toc-number">1.</span> <span class="toc-text">京东爬虫python+selenium+requests</span></a>
          <ol class="toc-child">
            <li class="toc-item toc-level-6"><a class="toc-link" href="#作者-ecin520"><span class="toc-number">1.0.0.0.0.1.</span> <span class="toc-text">作者-ecin520</span></a></li>
            <li class="toc-item toc-level-5"><a class="toc-link" href="#我们通过一个name保存商品名称"><span class="toc-number">1.0.0.0.1.</span> <span class="toc-text">我们通过一个name保存商品名称</span></a></li>
            <li class="toc-item toc-level-4">
              <a class="toc-link" href="#selenium模块介绍"><span class="toc-number">1.0.0.1.</span> <span class="toc-text">selenium模块介绍</span></a>
              <ol class="toc-child">
                <li class="toc-item toc-level-6"><a class="toc-link" href="#selenium安装"><span class="toc-number">1.0.0.1.0.1.</span> <span class="toc-text">selenium安装</span></a></li>
              </ol>
            </li>
            <li class="toc-item toc-level-4"><a class="toc-link" href="#开始爬虫之旅"><span class="toc-number">1.0.0.2.</span> <span class="toc-text">开始爬虫之旅</span></a></li>
            <li class="toc-item toc-level-4"><a class="toc-link" href="#分析json文本"><span class="toc-number">1.0.0.3.</span> <span class="toc-text">分析json文本</span></a></li>
            <li class="toc-item toc-level-4"><a class="toc-link" href="#xlwings模块导入excel中"><span class="toc-number">1.0.0.4.</span> <span class="toc-text">xlwings模块导入excel中</span></a></li>
            <li class="toc-item toc-level-4"><a class="toc-link" href="#去重"><span class="toc-number">1.0.0.5.</span> <span class="toc-text">去重</span></a></li>
            <li class="toc-item toc-level-4"><a class="toc-link" href="#源代码"><span class="toc-number">1.0.0.6.</span> <span class="toc-text">源代码</span></a></li>
            <li class="toc-item toc-level-4"><a class="toc-link" href="#总结"><span class="toc-number">1.0.0.7.</span> <span class="toc-text">总结</span></a></li>
          </ol>
        </li>
      </ol>
    </div>

    <!--
      Share panel; starts hidden and is toggled by the "Share" action link.
      Every query value (page URL and post title) is percent-encoded so that the
      literal "+" characters in the title and path are not decoded as spaces by
      the receiving service, and "&" separators are written as &amp;.
      Each icon-only link carries an aria-label because the <i> icon is aria-hidden.
    -->
    <div id="share-footer" style="display: none">
      <ul>
        <li><a class="icon" href="https://www.facebook.com/sharer.php?u=http%3A%2F%2Fyoursite.com%2F2019%2F09%2F29%2F%25E4%25BA%25AC%25E4%25B8%259C%25E7%2588%25AC%25E8%2599%25ABpython%2Bselenium%2Brequests%2F" target="_blank" rel="noopener" aria-label="Share on Facebook"><i class="fab fa-facebook fa-lg" aria-hidden="true"></i></a></li>
        <li><a class="icon" href="https://twitter.com/share?url=http%3A%2F%2Fyoursite.com%2F2019%2F09%2F29%2F%25E4%25BA%25AC%25E4%25B8%259C%25E7%2588%25AC%25E8%2599%25ABpython%2Bselenium%2Brequests%2F&amp;text=%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython%2Bselenium%2Brequests" target="_blank" rel="noopener" aria-label="Share on Twitter"><i class="fab fa-twitter fa-lg" aria-hidden="true"></i></a></li>
        <li><a class="icon" href="https://www.linkedin.com/shareArticle?url=http%3A%2F%2Fyoursite.com%2F2019%2F09%2F29%2F%25E4%25BA%25AC%25E4%25B8%259C%25E7%2588%25AC%25E8%2599%25ABpython%2Bselenium%2Brequests%2F&amp;title=%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython%2Bselenium%2Brequests" target="_blank" rel="noopener" aria-label="Share on LinkedIn"><i class="fab fa-linkedin fa-lg" aria-hidden="true"></i></a></li>
        <li><a class="icon" href="https://pinterest.com/pin/create/bookmarklet/?url=http%3A%2F%2Fyoursite.com%2F2019%2F09%2F29%2F%25E4%25BA%25AC%25E4%25B8%259C%25E7%2588%25AC%25E8%2599%25ABpython%2Bselenium%2Brequests%2F&amp;is_video=false&amp;description=%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython%2Bselenium%2Brequests" target="_blank" rel="noopener" aria-label="Share on Pinterest"><i class="fab fa-pinterest fa-lg" aria-hidden="true"></i></a></li>
        <li><a class="icon" href="mailto:?subject=%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython%2Bselenium%2Brequests&amp;body=Check%20out%20this%20article%3A%20http%3A%2F%2Fyoursite.com%2F2019%2F09%2F29%2F%25E4%25BA%25AC%25E4%25B8%259C%25E7%2588%25AC%25E8%2599%25ABpython%2Bselenium%2Brequests%2F" target="_blank" rel="noopener" aria-label="Share by email"><i class="fas fa-envelope fa-lg" aria-hidden="true"></i></a></li>
        <li><a class="icon" href="https://getpocket.com/save?url=http%3A%2F%2Fyoursite.com%2F2019%2F09%2F29%2F%25E4%25BA%25AC%25E4%25B8%259C%25E7%2588%25AC%25E8%2599%25ABpython%2Bselenium%2Brequests%2F&amp;title=%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython%2Bselenium%2Brequests" target="_blank" rel="noopener" aria-label="Save to Pocket"><i class="fab fa-get-pocket fa-lg" aria-hidden="true"></i></a></li>
        <li><a class="icon" href="https://reddit.com/submit?url=http%3A%2F%2Fyoursite.com%2F2019%2F09%2F29%2F%25E4%25BA%25AC%25E4%25B8%259C%25E7%2588%25AC%25E8%2599%25ABpython%2Bselenium%2Brequests%2F&amp;title=%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython%2Bselenium%2Brequests" target="_blank" rel="noopener" aria-label="Share on Reddit"><i class="fab fa-reddit fa-lg" aria-hidden="true"></i></a></li>
        <li><a class="icon" href="http://www.stumbleupon.com/submit?url=http%3A%2F%2Fyoursite.com%2F2019%2F09%2F29%2F%25E4%25BA%25AC%25E4%25B8%259C%25E7%2588%25AC%25E8%2599%25ABpython%2Bselenium%2Brequests%2F&amp;title=%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython%2Bselenium%2Brequests" target="_blank" rel="noopener" aria-label="Share on StumbleUpon"><i class="fab fa-stumbleupon fa-lg" aria-hidden="true"></i></a></li>
        <li><a class="icon" href="https://digg.com/submit?url=http%3A%2F%2Fyoursite.com%2F2019%2F09%2F29%2F%25E4%25BA%25AC%25E4%25B8%259C%25E7%2588%25AC%25E8%2599%25ABpython%2Bselenium%2Brequests%2F&amp;title=%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython%2Bselenium%2Brequests" target="_blank" rel="noopener" aria-label="Share on Digg"><i class="fab fa-digg fa-lg" aria-hidden="true"></i></a></li>
        <li><a class="icon" href="https://www.tumblr.com/share/link?url=http%3A%2F%2Fyoursite.com%2F2019%2F09%2F29%2F%25E4%25BA%25AC%25E4%25B8%259C%25E7%2588%25AC%25E8%2599%25ABpython%2Bselenium%2Brequests%2F&amp;name=%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython%2Bselenium%2Brequests&amp;description=" target="_blank" rel="noopener" aria-label="Share on Tumblr"><i class="fab fa-tumblr fa-lg" aria-hidden="true"></i></a></li>
        <li><a class="icon" href="https://news.ycombinator.com/submitlink?u=http%3A%2F%2Fyoursite.com%2F2019%2F09%2F29%2F%25E4%25BA%25AC%25E4%25B8%259C%25E7%2588%25AC%25E8%2599%25ABpython%2Bselenium%2Brequests%2F&amp;t=%E4%BA%AC%E4%B8%9C%E7%88%AC%E8%99%ABpython%2Bselenium%2Brequests" target="_blank" rel="noopener" aria-label="Share on Hacker News"><i class="fab fa-hacker-news fa-lg" aria-hidden="true"></i></a></li>
      </ul>
    </div>

    <!--
      Action bar: each link toggles one of the hidden panels above, or scrolls to the top.
      Every handler ends with "return false" so the placeholder href="#" is not
      followed; without it the browser jumps to the top immediately and appends
      "#" to the URL, bypassing the scroll animation.
    -->
    <div id="actions-footer">
        <a id="menu" class="icon" href="#" onclick="$('#nav-footer').toggle();return false;"><i class="fas fa-bars fa-lg" aria-hidden="true"></i> Menu</a>
        <a id="toc" class="icon" href="#" onclick="$('#toc-footer').toggle();return false;"><i class="fas fa-list fa-lg" aria-hidden="true"></i> TOC</a>
        <a id="share" class="icon" href="#" onclick="$('#share-footer').toggle();return false;"><i class="fas fa-share-alt fa-lg" aria-hidden="true"></i> Share</a>
        <a id="top" style="display:none" class="icon" href="#" onclick="$('html, body').animate({ scrollTop: 0 }, 'fast');return false;"><i class="fas fa-chevron-up fa-lg" aria-hidden="true"></i> Top</a>
    </div>

  </div>
</div>

        
        <!-- Site footer: copyright notice on the left, site navigation links on the right. -->
        <footer id="footer">
          <div class="footer-left">
            Copyright © 2020 ECIN520
          </div>
          <div class="footer-right">
            <nav>
              <ul>
                <li><a href="/">Home</a></li>
                <li><a href="/about/">About</a></li>
                <li><a href="/archives/">Writing</a></li>
                <li><a href="/projects_url">Projects</a></li>
              </ul>
            </nav>
          </div>
        </footer>

    </div>
    <!-- styles -->
<link rel="stylesheet" href="/lib/font-awesome/css/all.min.css">
<link rel="stylesheet" href="/lib/justified-gallery/css/justifiedGallery.min.css">

    <!-- jquery -->
<script src="/lib/jquery/jquery.min.js"></script>
<script src="/lib/justified-gallery/js/jquery.justifiedGallery.min.js"></script>
<!-- clipboard -->

  <script src="/lib/clipboard/clipboard.min.js"></script>
  <script type="text/javascript">
  // On DOM ready: insert a "copy to clipboard" button before every highlighted
  // code table and wire all of those buttons to a single ClipboardJS instance.
  $(function() {
    // Tooltip text (exposed through aria-label) before and after a successful copy.
    var idleLabel = "Copy to clipboard!";
    var copiedLabel = "Copied!";
    // How long the "Copied!" confirmation stays up before reverting, in milliseconds.
    var resetDelayMs = 2000;

    // copy-btn HTML
    var btn = "<span class=\"btn-copy tooltipped tooltipped-sw\" aria-label=\"" + idleLabel + "\">";
    btn += '<i class="far fa-clone"></i>';
    btn += '</span>';
    // mount it!
    $(".highlight table").before(btn);

    var clip = new ClipboardJS('.btn-copy', {
      // The button was inserted directly before the table, so the table is its
      // next element sibling. The copied text is the innerText of every `.code`
      // element inside that table, each followed by a newline.
      text: function(trigger) {
        return Array.from(trigger.nextElementSibling.querySelectorAll('.code')).reduce((str,it)=>str+it.innerText+'\n','')
      }
    });

    clip.on('success', function(e) {
      var trigger = e.trigger;
      trigger.setAttribute('aria-label', copiedLabel);
      e.clearSelection();
      // Revert the label after a short delay; otherwise the button would keep
      // announcing "Copied!" for the rest of the page's lifetime.
      setTimeout(function() {
        trigger.setAttribute('aria-label', idleLabel);
      }, resetDelayMs);
    });
  })
  </script>

<script src="/js/main.js"></script>
<!-- search -->

<!-- Google Analytics -->

<!-- Baidu Analytics -->

<!-- Disqus Comments -->


</body>
</html>
